# define alternative package directoryr_on_cluster <-FALSEif (r_on_cluster ==TRUE) { bp <-"/???" computer <-"???".libPaths(paste0( bp,"???", computer ) )} else {if (grepl(x =getwd(), pattern ="carl")) { bp <-"/home/carl/Dokumente/06_projects/"# set a more recent R snapshot as source repo# r = getOption("repos") # r["CRAN"] = "https://mran.microsoft.com/snapshot/2022-12-08"# options(repos = r)# rm(r) }}#+ load.packages, include=Ffor (i inc("renv","data.table","brms","here","broom","jtools","broom.mixed","readxl","Hmisc","sparkline","ggplot2","scales","ggrepel","ggthemes","corrplot","cowplot","plotly") ) {suppressPackageStartupMessages(library(i, character.only =TRUE ))}# Check unsuccessful updates packages# old.packages()# Update packages to that snapshot# update.packages(# ask = FALSE, # checkBuilt = TRUE# )# ggplot themeggplot2::theme_set(theme_tufte(base_size =14) +theme(panel.background =element_rect(colour ="grey35"),plot.background =element_rect(fill ="white", colour =NA) ))# When within the Charite network or VPN, proxy settings need to be adaptedchariteVPN <-TRUEif (chariteVPN) {Sys.setenv(https_proxy ="proxy.charite.de:8080")Sys.setenv(http_proxy ="proxy.charite.de:8080")}# Knitr should use the project root and not the script location as rootknitr::opts_knit$set(root.dir =here())knitr::opts_knit$set(base.dir =here("scripts"))print(here())
data/raw-data/data2.txt: Used for the main analysis
data/raw-data/global burden disease region.xlsx: Gives countries for global regions used in the paper.
data/raw-data/owid-covid-data_220208_22c8de6.txt: Maybe the raw data
0.2.1 Explore the raw data
Find some information about the first data and use it to annotate or name it. It looks like only the file data/raw-data/data2.txt was used in the analysis.
Open Questions:
When was the data accessed?
Several variables have typos:
data_2: “Vccination21”
Check on basic descriptive statistics like missingness.
Show the code
# Hmisc function to quickly scan for NAs in datacontents(data_1)
data_1 Contents
Data frame:data_1
160237 observations and 67 variables, maximum # NAs:155098
Name
Class
Storage
NAs
iso_code
character
0
continent
character
0
location
character
0
date
IDate
integer
0
total_cases
double
2875
new_cases
double
2909
new_cases_smoothed
double
4060
total_deaths
double
20477
new_deaths
double
20310
new_deaths_smoothed
double
20440
total_cases_per_million
double
3607
new_cases_per_million
double
3641
new_cases_smoothed_per_million
double
4787
total_deaths_per_million
double
21196
new_deaths_per_million
double
21029
new_deaths_smoothed_per_million
double
21159
reproduction_rate
double
39346
icu_patients
double
137694
icu_patients_per_million
double
137694
hosp_patients
double
136956
hosp_patients_per_million
double
136956
weekly_icu_admissions
double
155098
weekly_icu_admissions_per_million
double
155098
weekly_hosp_admissions
double
149821
weekly_hosp_admissions_per_million
double
149821
new_tests
double
94096
total_tests
double
92909
total_tests_per_thousand
double
92909
new_tests_per_thousand
double
94096
new_tests_smoothed
double
78677
new_tests_smoothed_per_thousand
double
78677
positive_rate
double
83915
tests_per_case
double
84475
tests_units
character
0
total_vaccinations
double
117442
people_vaccinated
double
119464
people_fully_vaccinated
double
122214
total_boosters
double
145084
new_vaccinations
double
124688
new_vaccinations_smoothed
double
81607
total_vaccinations_per_hundred
double
117442
people_vaccinated_per_hundred
double
119464
people_fully_vaccinated_per_hundred
double
122214
total_boosters_per_hundred
double
145084
new_vaccinations_smoothed_per_million
double
81607
new_people_vaccinated_smoothed
double
82782
new_people_vaccinated_smoothed_per_hundred
double
82782
stringency_index
double
34430
population
double
1049
population_density
double
17606
median_age
double
27311
aged_65_older
double
28753
aged_70_older
double
28024
gdp_per_capita
double
26692
extreme_poverty
double
72237
cardiovasc_death_rate
double
28338
diabetes_prevalence
double
21427
female_smokers
double
57948
male_smokers
double
59416
handwashing_facilities
double
94165
hospital_beds_per_thousand
double
40958
life_expectancy
double
10666
human_development_index
double
28863
excess_mortality_cumulative_absolute
double
154777
excess_mortality_cumulative
double
154777
excess_mortality
double
154777
excess_mortality_cumulative_per_million
double
154777
Show the code
contents(data_2)
data_2 Contents
Data frame:data_2
109 observations and 41 variables, maximum # NAs:78
Name
Storage
NAs
location
character
0
GBDR7
character
0
Testing20
double
0
Incidence20
double
0
Mortality20
double
0
Vaccination20
double
78
Testing21
double
0
Incidence21
double
0
Mortality21
double
0
Vccination21
double
0
max20_21_tests
double
0
max20_21_cases
double
0
max20_21_deaths
double
0
max20_21_peopleVaccinated
double
0
Age
double
0
GE
double
0
VA
double
0
PS
double
0
RQ
double
0
RL
double
0
CC
double
0
LEB
double
0
EYS
double
0
MYS
double
0
GNI
double
0
Gini
double
0
Urban
double
0
CHE
double
0
MMR
integer
0
FLB
double
0
ABR
double
0
SSP
double
0
PSE
double
0
EP
double
0
VE
double
0
Population
integer
0
PD
double
0
AAG
double
0
GDP
double
0
GI
double
0
HDI
double
0
Show the code
contents(data_3)
data_3 Contents
Data frame:data_3
113 observations and 5 variables, maximum # NAs:6
Name
Storage
NAs
Country.Territory
character
0
Code
character
0
Region
character
0
GBDR 21
character
6
GBDR7
character
0
This describe() call would be nice but currently the rendering fails when loading qreport package.
Show the code
d <-describe(data_2)html(d, size =80, scroll =TRUE)
Summaries can also quickly be created using the movStats() function
Show the code
# A shorthand for calculating summaries using data.tablewriteLines("Number of tests 2020 per meta-region:")
Check the data sources. data_2 contains per-country information on testing, indidences, mortality etc.
Show the code
# Check number of locationswriteLines("Total number of countries:")
Total number of countries:
Show the code
data_2[, uniqueN(location)]
[1] 109
Show the code
# Number of entries per locationwriteLines("Countries per meta-region:")
Countries per meta-region:
Show the code
data_2[, .N, GBDR7]
GBDR7
N
Central Europe, Eastern Europe, and Central Asia
22
High-Income
31
Latin America and Caribbean
16
North Africa and Middle East
10
Southeast Asia, East Asia, and Oceania
12
Sub-Saharan Africa
18
Create a long-form version of the data and print summary information.
Show the code
# long format helps evaluate each variabledata_2_long <-melt(data_2, id.vars =c("location", "GBDR7")) |>suppressWarnings()writeLines("Variables and missings per meta-region:")
We have several indices, some are a combination of others, like the HDI or GI.
Show the code
all_indices <-c("HDI","LEB","EYS","MYS","GNI","GDP","CHE","GI","MMR","ABR","FLB","PSE","Gini","UP","PD","EP","VE","AAG")# I want the column names to match the proper index namesall_indices[!(all_indices %in%colnames(data_2))]
[1] "UP"
Show the code
# Correct typo in namesetnames(data_2, "Vccination21", "Vaccination21")# UPsetnames(x = data_2, "Urban", "UP")# I want to remember what indices are for what and how they are connectedindices_structured <-list(Wealth =list("HDI"=c("LEB", "EYS", "MYS", "GNI"), "GDP"=c(), "CHE"=c() ),Inequality =list("GI"=c("MMR", "ABR", "FLB", "PSE"),"Gini"=c() ),Demographic_SES =list("UP"=c(),"PD"=c(),"EP"=c(),"VE"=c() ),Governance =list("AAG"=c()),"Mortality"=c(),"Incidence"=c(),"Vaccination"=c(),"Test_Capacity"=c())
0.3.1 Index correlation
I can plot basic overviews of the predictor data. First plot correlation of all variables.
# For plotting by year I need to melt the data by yeardata_2_year <-melt( data_2, id.vars =c("location", "GBDR7"), measure.vars =c("Testing20", "Testing21"), variable.name ="Year", value.name ="Tests")data_2_year[, Year :=ifelse(Year =="Testing20", "2020", "2021")]# Get log of PD for plottingdata_2$log_PD <-log10(data_2$PD)# select my interesting indicesnci <-c("CHE", "UP", "log_PD", "EP", "GI", "VE", "Gini")# Add the indicesm1 <-match(data_2_year$location, data_2$location)data_2_year[, c(nci, "Population") := data_2[(m1), .SD, .SDcols =c((nci), "Population")]]# Plot data with their meandata_2_plot <-copy(data_2_year)# Melt again to plot all indices at the same timedata_2_plot <-melt(data_2_plot, measure.vars = nci, variable.name ="Index", value.name ="Value", variable.factor =FALSE)
Per-region display to illustrate the effect difference within the regions.
1 TODO Add vaccination plot
Show the code
# Simpsons Paradox plot_list <-vector("list", length(nci))names(plot_list) <- ncifor (i inseq_along(names(plot_list))) { index <-names(plot_list)[i]# i <- "GI" p <-ggplot(data_2_plot[Index == (index)], aes(x = Value, y = Tests, col = GBDR7, size = Population) ) +geom_point() +facet_grid(Year ~ GBDR7) +scale_y_continuous(trans ="log10",labels =trans_format("log10",math_format(10^.x))) +annotation_logticks(sides ="l", outside = T, long = grid::unit(1.5, "mm"),mid = grid::unit(0, "mm"),short = grid::unit(0, "mm") ) +coord_cartesian(clip ="off") +labs(y ="Testing", x = index) +guides(size ="none") +theme(legend.position ="bottom", legend.direction ="vertical",legend.justification ="left", axis.ticks.y =element_blank(),strip.text.x =element_blank()) +geom_smooth(method ="lm", aes(x = Value, y = Tests, group = GBDR7), col ="indianred4",formula ="y ~ x")# Remove all but the last legend for plottingif (i !=length(names(plot_list))) { p <- p +theme(legend.position ="none") }# Add to list plot_list[[index]] <- p}# Extract the color legend to plot separatelylegend <-get_legend(# create some space to the left of the legendlast(plot_list))# Remove from the plot to only plot extracted plot_list[[7]] <- plot_list[[7]] +theme(legend.position ="none")plot_list[["Legend"]] <- legend# Plottingcowplot::plot_grid(plotlist = plot_list, ncol =1)
GI on a *10 scale makes the effect more interpretable (effect of changes +- 0.1 instead of +- 1)
Show the code
# Use log of PDdata_2$log_PD <-log10(data_2$PD)# Rescale the GI to get the effect estimates in terms of 0.1 increase of GIdata_2$GI_10 <- data_2$GI *10# compare raw with scaled datapar(mfrow =c(2,2))data_2[, plot(log2(Testing20) ~ PD, pch =19)]